{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import functools\n",
    "import os\n",
    "import time\n",
    "\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计时器\n",
    "def timer(func):\n",
    "    \"\"\"Decorator that prints how long each call to ``func`` takes.\n",
    "\n",
    "    The wrapped function's return value is passed through unchanged, and\n",
    "    ``functools.wraps`` keeps its ``__name__`` and docstring intact.\n",
    "    \"\"\"\n",
    "    @functools.wraps(func)\n",
    "    def wrapper(*args,**kwargs):\n",
    "        # perf_counter is monotonic, so the measured interval is not affected\n",
    "        # by system clock adjustments during the call.\n",
    "        start = time.perf_counter()\n",
    "        result = func(*args,**kwargs)\n",
    "        elapsed = time.perf_counter()-start\n",
    "        print(func.__name__+'运行时间：',elapsed)\n",
    "        return result\n",
    "    return wrapper\n",
    "\n",
    "@timer\n",
    "def load_train():\n",
    "    \"\"\"Load the labelled training set as NumPy arrays.\n",
    "\n",
    "    Returns:\n",
    "        (x_train_arr, y_train_arr): the feature values with missing entries\n",
    "        replaced by -1, and the flattened ``label`` column.\n",
    "    \"\"\"\n",
    "    labels = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "    features = joblib.load('../semi_super/x_train.lz4').fillna(-1)\n",
    "    return features.values,labels.values.ravel()\n",
    "\n",
    "@timer\n",
    "def load_unlabel():\n",
    "    \"\"\"Load the unlabelled set and pseudo-label it from predicted probabilities.\n",
    "\n",
    "    Rows are ranked by ``prob`` in descending order; the top ``int(0.1 * n)``\n",
    "    rows get pseudo-label 1 and the rest 0.  The returned arrays are arranged\n",
    "    with the pseudo-negative rows first and the pseudo-positive rows last.\n",
    "\n",
    "    Returns:\n",
    "        (x_unlabel_arr, y_unlabel_arr, start) where ``start`` equals\n",
    "        ``-int(0.1 * n)``: counted from the end, the position at which the\n",
    "        pseudo-positive rows begin.\n",
    "    \"\"\"\n",
    "    probs = pd.read_csv('../predict_unlabel/preds/unlabel_pred.csv',usecols=['prob'])\n",
    "    features = joblib.load('../semi_super/x_unlabel.lz4')\n",
    "    ranked = pd.concat([features,probs],axis=1)\n",
    "    ranked = ranked.sort_values(by=['prob'],ascending=False).reset_index(drop=True)\n",
    "    ranked = ranked.fillna(-1)\n",
    "\n",
    "    n_pos = int(0.1*len(ranked))\n",
    "\n",
    "    labels = ranked[['prob']].copy()\n",
    "    # .loc slices include both end points, so row n_pos is written twice; the\n",
    "    # second assignment wins and leaves exactly n_pos rows labelled 1.\n",
    "    labels.loc[:n_pos,'prob'] = 1\n",
    "    labels.loc[n_pos:,'prob'] = 0\n",
    "\n",
    "    x_sorted = ranked.drop(columns=['prob']).values\n",
    "    y_sorted = labels.values.ravel()\n",
    "\n",
    "    # Rotate both arrays so the pseudo-negatives come first and the\n",
    "    # pseudo-positives last.\n",
    "    x_unlabel_arr = np.vstack((x_sorted[n_pos:],x_sorted[:n_pos]))\n",
    "    y_unlabel_arr = np.hstack((y_sorted[n_pos:],y_sorted[:n_pos]))\n",
    "\n",
    "    return x_unlabel_arr,y_unlabel_arr,-n_pos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 选择合适的正负比例及数量的unlabel数据加入train集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import xgboost as xgb\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "import lightgbm as lgb\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# @timer\n",
    "def gen_semiset(x_unlabel_arr,y_unlabel_arr,start=-6653,num=10,ratio=0.1):\n",
    "    \"\"\"Cut a window of pseudo-labelled rows around the index ``start``.\n",
    "\n",
    "    The window holds the ``num - int(num*ratio)`` rows immediately before\n",
    "    ``start`` followed by the ``int(num*ratio)`` rows from ``start`` onwards.\n",
    "    It is clipped to the array bounds when it would run past either end.\n",
    "\n",
    "    Args:\n",
    "        x_unlabel_arr: feature rows; assumed to be aligned with y_unlabel_arr.\n",
    "        y_unlabel_arr: labels, one per feature row.\n",
    "        start: pivot index; a negative value counts from the end.\n",
    "        num: total number of rows requested.\n",
    "        ratio: fraction of ``num`` taken from ``start`` onwards.\n",
    "\n",
    "    Returns:\n",
    "        (tmp, y_tmp): the selected feature rows and their labels.\n",
    "    \"\"\"\n",
    "    num_1 = int(num*ratio)\n",
    "    num_0 = num-num_1\n",
    "    total = len(x_unlabel_arr)\n",
    "    # Resolve a negative start to an absolute position before adding offsets.\n",
    "    # Slicing with start+num_1 directly returns an empty window once that sum\n",
    "    # reaches 0 and the wrong rows once it turns positive.\n",
    "    pivot = start+total if start < 0 else start\n",
    "    lo = min(max(pivot-num_0,0),total)\n",
    "    hi = min(max(pivot+num_1,0),total)\n",
    "    return x_unlabel_arr[lo:hi],y_unlabel_arr[lo:hi]\n",
    "\n",
    "# @timer\n",
    "def combine_data(x_train_arr,y_train_arr,tmp,y_tmp):\n",
    "    \"\"\"Append the selected pseudo-labelled rows to the labelled training set.\n",
    "\n",
    "    Features are stacked with ``np.vstack`` and labels are joined with\n",
    "    ``np.hstack``; mixing the two up is an easy mistake here.\n",
    "\n",
    "    Returns:\n",
    "        (x_train_tmp, y_train_tmp): the combined features and labels.\n",
    "    \"\"\"\n",
    "    stacked_x = np.vstack([x_train_arr,tmp])\n",
    "    stacked_y = np.hstack([y_train_arr,y_tmp])\n",
    "    return stacked_x,stacked_y\n",
    "\n",
    "# @timer\n",
    "def gen_local_train(x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr,\n",
    "                    start=-6653,num=10,ratio=0.1):\n",
    "    \"\"\"Build a training set from the labelled data plus a pseudo-labelled window.\n",
    "\n",
    "    The window is chosen by ``gen_semiset`` and appended by ``combine_data``.\n",
    "\n",
    "    Returns:\n",
    "        (x_train_tmp, y_train_tmp): the combined features and labels.\n",
    "    \"\"\"\n",
    "    picked_x,picked_y = gen_semiset(x_unlabel_arr,y_unlabel_arr,start=start,num=num,ratio=ratio)\n",
    "    merged = combine_data(x_train_arr,y_train_arr,picked_x,picked_y)\n",
    "    return merged\n",
    "\n",
    "\n",
    "# @timer\n",
    "def train(x_train,y_train,x_test,y_test,model_name='rfc'):\n",
    "    \"\"\"Fit one model on the training split and return its ROC AUC on the test split.\n",
    "\n",
    "    Args:\n",
    "        x_train, y_train: training features and binary labels.\n",
    "        x_test, y_test: evaluation features and labels; for 'xgb' they also\n",
    "            serve as the validation set that drives early stopping.\n",
    "        model_name: one of 'rfc', 'gbc', 'xgb', 'lgb', 'dtc'.\n",
    "\n",
    "    Returns:\n",
    "        ROC AUC of the predicted positive-class scores against y_test.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if model_name is not a supported name.\n",
    "    \"\"\"\n",
    "    supported = ('rfc','gbc','xgb','lgb','dtc')\n",
    "    if model_name not in supported:\n",
    "        # An unknown name used to fall through every branch and only fail at\n",
    "        # the AUC call because y_pre was never assigned.\n",
    "        raise ValueError('unknown model_name {!r}; expected one of {}'.format(model_name,supported))\n",
    "\n",
    "    if model_name in ('xgb','lgb'):\n",
    "        # Class-imbalance weight: number of negatives divided by positives.\n",
    "        n_pos = float(np.sum(np.ravel(y_train)))\n",
    "        pos_weight = float(len(y_train)-n_pos)/n_pos\n",
    "\n",
    "    if model_name == 'rfc':\n",
    "        rfc = RandomForestClassifier(n_estimators=10,\n",
    "                                     n_jobs=32,\n",
    "                                     max_features='sqrt',\n",
    "                                     class_weight='balanced',\n",
    "                                     random_state=2018)\n",
    "        rfc.fit(x_train,y_train)\n",
    "        y_pre = rfc.predict_proba(x_test)[:,1]\n",
    "    elif model_name == 'gbc':\n",
    "        # loss is left at its default (binomial deviance / log-loss): the\n",
    "        # literal 'deviance' is rejected by newer scikit-learn releases.\n",
    "        gbc = GradientBoostingClassifier(learning_rate=0.1,\n",
    "                                         n_estimators=300,\n",
    "                                         subsample=0.9,\n",
    "                                         max_depth=3,\n",
    "                                         verbose=1,\n",
    "                                         random_state=2018)\n",
    "        gbc.fit(x_train,y_train)\n",
    "        y_pre = gbc.predict_proba(x_test)[:,1]\n",
    "    elif model_name == 'dtc':\n",
    "        dtc = DecisionTreeClassifier(class_weight='balanced')\n",
    "        dtc.fit(x_train,y_train)\n",
    "        # predict() returns a 1-D label vector, so the former [:,1] indexing\n",
    "        # raised IndexError; the AUC needs the positive-class probability.\n",
    "        y_pre = dtc.predict_proba(x_test)[:,1]\n",
    "    elif model_name == 'xgb':\n",
    "        params = {\n",
    "            'booster':'gbtree',\n",
    "            'objective':'binary:logistic',\n",
    "            # Previously computed from an undefined name (data_label).\n",
    "            'scale_pos_weight':pos_weight,\n",
    "            'eval_metric':'auc',\n",
    "            'gamma':1,\n",
    "            'max_depth':6,\n",
    "            'lambda':1,\n",
    "            'subsample':0.9,\n",
    "            'colsample_bytree':0.9,\n",
    "            'min_child_weight':1,\n",
    "            'eta':0.04,\n",
    "            'seed':2010,\n",
    "            'nthread':32\n",
    "        }\n",
    "        dtrain = xgb.DMatrix(x_train,y_train)\n",
    "        dvalid = xgb.DMatrix(x_test,y_test)\n",
    "        # xgb.train takes early_stopping_rounds as an argument of its own\n",
    "        # rather than as an entry of the booster params dict.\n",
    "        xgb_model = xgb.train(params,dtrain,num_boost_round=300,\n",
    "                              evals=[(dvalid,'valid')],\n",
    "                              early_stopping_rounds=100)\n",
    "        y_pre = xgb_model.predict(xgb.DMatrix(x_test)).ravel()\n",
    "    else:\n",
    "        # model_name == 'lgb'\n",
    "        parameters = {\n",
    "            'boost':'gbdt',\n",
    "            'num_leaves':135,\n",
    "            'scale_pos_weight':pos_weight,\n",
    "            'max_depth':-1,\n",
    "            'learning_rate':.05,\n",
    "            'max_bin':200,\n",
    "            'min_data_in_leaf':60,\n",
    "            'objective':'binary',\n",
    "            'metric':'auc',\n",
    "            'verbose':1,\n",
    "            'num_threads':36\n",
    "        }\n",
    "        lgb_train = lgb.Dataset(x_train,np.ravel(y_train))\n",
    "        lgb_model = lgb.train(parameters,lgb_train,num_boost_round=50)\n",
    "        y_pre = lgb_model.predict(x_test)\n",
    "\n",
    "    auc = metrics.roc_auc_score(y_test,y_pre)\n",
    "    return auc\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_train运行时间： 10.11999797821045\n",
      "load_unlabel运行时间： 34.36000037193298\n"
     ]
    }
   ],
   "source": [
    "# Load the labelled training data and the pseudo-labelled unlabelled data.\n",
    "x_train_arr,y_train_arr = load_train()\n",
    "x_unlabel_arr,y_unlabel_arr,start = load_unlabel()\n",
    "START = start\n",
    "\n",
    "# Build the augmented training set: NUM pseudo-labelled rows around START,\n",
    "# with int(NUM*ratio) of them taken from START onwards (ratio = 1/STEP).\n",
    "STEP = 6\n",
    "NUM = 15000\n",
    "x_train_new,y_train_new = gen_local_train(x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr,start=START,num=NUM,ratio=1/STEP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((48465, 6704), (48465,))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: shapes of the augmented features and labels.\n",
    "x_train_new.shape,y_train_new.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### bagging train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name,y_train=None):\n",
    "    \"\"\"Build an unfitted classifier for the bagging ensemble.\n",
    "\n",
    "    Args:\n",
    "        model_name: 'XGB', 'RFC' or 'LGB'.\n",
    "        y_train: binary labels used to derive ``scale_pos_weight`` for the\n",
    "            boosted models.  Defaults to the notebook-level ``y_train_arr``.\n",
    "\n",
    "    Returns:\n",
    "        The configured, not yet fitted, estimator.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if model_name is not a supported name.\n",
    "    \"\"\"\n",
    "    if model_name not in ('XGB','RFC','LGB'):\n",
    "        # Previously an unknown name reached `return model` with nothing bound.\n",
    "        raise ValueError('unknown model_name {!r}; expected XGB, RFC or LGB'.format(model_name))\n",
    "\n",
    "    if model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        return RandomForestClassifier(n_estimators=1500,\n",
    "                                      n_jobs=40,\n",
    "                                      max_features='sqrt',\n",
    "                                      class_weight='balanced',\n",
    "                                      random_state=2018)\n",
    "\n",
    "    if y_train is None:\n",
    "        y_train = y_train_arr\n",
    "    # Class-imbalance weight: number of negatives divided by positives.\n",
    "    n_pos = float(np.sum(y_train))\n",
    "    pos_weight = float(len(y_train)-n_pos)/n_pos\n",
    "\n",
    "    if model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "        # The scikit-learn wrapper sizes the ensemble with n_estimators;\n",
    "        # num_boost_round is only an argument of xgb.train.\n",
    "        # early_stopping_rounds is dropped because the model is fitted\n",
    "        # without any validation set to monitor.\n",
    "        return XGBClassifier(max_depth=6,\n",
    "                             learning_rate=0.04,\n",
    "                             booster='gbtree',\n",
    "                             objective='binary:logistic',\n",
    "                             scale_pos_weight=pos_weight,\n",
    "                             eval_metric='auc',\n",
    "                             gamma=1,\n",
    "                             reg_lambda=1,\n",
    "                             subsample=0.9,\n",
    "                             min_child_weight=1,\n",
    "                             seed=2018,\n",
    "                             silent=False,\n",
    "                             n_jobs=40,\n",
    "                             n_estimators=300)\n",
    "\n",
    "    from lightgbm import LGBMClassifier\n",
    "    # The estimator's own keyword names replace the LightGBM aliases (boost,\n",
    "    # min_data_in_leaf, num_threads, num_boost_round) used before, which avoids\n",
    "    # the alias warning.  'slient' was a misspelt keyword; verbose=1 requests\n",
    "    # the log output it was aiming for.\n",
    "    return LGBMClassifier(boosting_type='gbdt',\n",
    "                          num_leaves=135,\n",
    "                          scale_pos_weight=pos_weight,\n",
    "                          max_depth=-1,\n",
    "                          learning_rate=.04,\n",
    "                          max_bin=200,\n",
    "                          min_child_samples=60,\n",
    "                          objective='binary',\n",
    "                          metric='auc',\n",
    "                          n_jobs=40,\n",
    "                          n_estimators=300,\n",
    "                          verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XGB\n",
      "RFC\n",
      "LGB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "# Train every base model and persist it under ./model.\n",
    "# NOTE(review): the models are fitted on x_train_arr/y_train_arr, not on the\n",
    "# augmented x_train_new/y_train_new built earlier -- confirm this is intended.\n",
    "# makedirs with exist_ok avoids the separate exists()/mkdir() check.\n",
    "os.makedirs('./model',exist_ok=True)\n",
    "model_list = ['XGB','RFC','LGB']\n",
    "for model in model_list:\n",
    "    print(model)\n",
    "    clf = SelectModel(model)\n",
    "    clf.fit(x_train_arr,y_train_arr)\n",
    "    joblib.dump(clf,'./model/{}'.format(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 预测结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the validation data; TAG_FILE names the directory whose valid_tag.csv is used.\n",
    "tag_files = ['predict_tag','predict_tag_new']\n",
    "TAG_FILE = tag_files[1]\n",
    "def load_valid(tag_file):\n",
    "    \"\"\"Assemble the validation feature matrix for the given tag directory.\n",
    "\n",
    "    Five sources are concatenated column-wise, in this order: valid_date.csv\n",
    "    (without id, loan_hour), valid_nodup.lz4 (without id, loan_dt), the tag\n",
    "    column of ../<tag_file>/valid_tag.csv, valid_row_null.csv (without id)\n",
    "    and maj_cnt_test.lz4.  Missing values are replaced by -1.\n",
    "\n",
    "    Args:\n",
    "        tag_file: name of the directory that contains valid_tag.csv.\n",
    "\n",
    "    Returns:\n",
    "        The combined features as a NumPy array.\n",
    "    \"\"\"\n",
    "    date_part = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])\n",
    "    raw_part = joblib.load('../../preprocess_data_new/valid_nodup.lz4').drop(columns=['id','loan_dt'])\n",
    "    tag_part = pd.read_csv('../{}/valid_tag.csv'.format(tag_file),usecols=['tag'])\n",
    "    null_part = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])\n",
    "    count_part = joblib.load('../../preprocess_data_discrete/maj_cnt_test.lz4')\n",
    "\n",
    "    parts = [date_part,raw_part,tag_part,null_part,count_part]\n",
    "    return pd.concat(parts,axis=1).fillna(-1).values\n",
    "\n",
    "# Validation features built with the tag directory selected above.\n",
    "x_test_arr = load_valid(TAG_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prediction settings: output directory, models to score, validation ids.\n",
    "valid_save_path = './{}_valid_preds'.format(TAG_FILE)\n",
    "model_list = ['XGB','RFC','LGB']\n",
    "# NOTE(review): ids are read from preprocess_data_new/valid_date.csv while\n",
    "# load_valid reads preprocess_data/valid_date.csv -- confirm that both files\n",
    "# list the rows in the same order.\n",
    "valid_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values.ravel()\n",
    "\n",
    "def predict_score(save_path,model_list,id,x_test=None):\n",
    "    \"\"\"Score the features with each saved model and write one CSV per model.\n",
    "\n",
    "    Args:\n",
    "        save_path: directory for the per-model files; created if missing.\n",
    "        model_list: names of the models stored under ./model.\n",
    "        id: sample identifiers written to the ``id`` column.\n",
    "        x_test: feature matrix to score.  Defaults to the notebook-level\n",
    "            ``x_test_arr``.\n",
    "    \"\"\"\n",
    "    if x_test is None:\n",
    "        x_test = x_test_arr\n",
    "    # makedirs with exist_ok also copes with nested or already existing paths.\n",
    "    os.makedirs(save_path,exist_ok=True)\n",
    "    for model in model_list:\n",
    "        # NOTE: joblib.load unpickles arbitrary objects; only load model files\n",
    "        # that were produced by a trusted run.\n",
    "        clf = joblib.load('./model/{}'.format(model))\n",
    "        pred = pd.DataFrame({'id':id,'score':clf.predict_proba(x_test)[:,1]},\n",
    "                            columns=['id','score'])\n",
    "        pred.to_csv(os.path.join(save_path,'{}.csv'.format(model)),index=False)\n",
    "        \n",
    "# Write one score file per model for the validation set.\n",
    "predict_score(valid_save_path,model_list,valid_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the per-model predictions from the directory passed to predict_score.\n",
    "valid_pred_dir = valid_save_path\n",
    "\n",
    "def combine_score_to_prob(pred_dir,model_list,test_id,out_path=None):\n",
    "    \"\"\"Average the per-model scores into one bagged probability file.\n",
    "\n",
    "    Args:\n",
    "        pred_dir: directory holding one ``<model>.csv`` with a ``score`` column\n",
    "            per entry of model_list.\n",
    "        model_list: names of the models to average.\n",
    "        test_id: sample identifiers written to the ``id`` column.\n",
    "        out_path: destination CSV.  Defaults to\n",
    "            './<TAG_FILE>_bagging_pred.csv' using the notebook-level TAG_FILE.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if model_list is empty.\n",
    "    \"\"\"\n",
    "    if len(model_list) == 0:\n",
    "        raise ValueError('model_list must contain at least one model name')\n",
    "    if out_path is None:\n",
    "        out_path = './{}_bagging_pred.csv'.format(TAG_FILE)\n",
    "\n",
    "    total = None\n",
    "    for model in model_list:\n",
    "        scores = pd.read_csv(os.path.join(pred_dir,'{}.csv'.format(model)))['score']\n",
    "        # Plain addition keeps every loaded Series untouched.\n",
    "        total = scores if total is None else total+scores\n",
    "    mean_score = total/len(model_list)\n",
    "\n",
    "    pred = pd.DataFrame({'id':test_id,'prob':mean_score.values},columns=['id','prob'])\n",
    "    pred.to_csv(out_path,index=False)\n",
    "\n",
    "# Average the per-model scores into the final bagged prediction file.\n",
    "combine_score_to_prob(valid_pred_dir,model_list,valid_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "predict_tag_bagging_pred  AUC:0.81929804644399  \n",
    "predict_tag_new_bagging_pred AUC:0.8185711623187"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
